library(Biostrings)
library(gplots)
## Warning: package 'gplots' was built under R version 3.2.4
library(RColorBrewer)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
library(reshape2)
library(plyr)
## Warning: package 'plyr' was built under R version 3.2.5
Tn004.fasta <- readDNAStringSet('Tn004_S1_L001_redundans/cdhit1000.fa')
tet <- oligonucleotideFrequency(Tn004.fasta, 2, as.prob = T)
rownames(tet) <- names(Tn004.fasta)
cl <- hclust(dist(tet), method='ward.D2')
plot(as.dendrogram(cl), leaflab='none')
#hmcol <- rev(colorRampPalette(brewer.pal(9, "YlGnBu"))(255))
#lab.col <- colorRampPalette(brewer.pal(9, 'RdPu'))(255)
#scaled.sizes <- trunc(width(Tn004.2kb)/max(width(Tn004.2kb))*255)
#heatmap.2(tet, dendrogram = 'row', Rowv = as.dendrogram(cl), trace = 'none', labRow = '', scale='row', col=hmcol, lhei = c(0.5, 4), RowSideColors = lab.col[scaled.sizes])
PCA plot - how well does clustering match up with this?
cl.split <- cutree(cl, 5)
table(cl.split)
## cl.split
## 1 2 3 4 5
## 736 772 445 375 85
tet.pca <- prcomp(tet, center = T, scale. = T)
plot(tet.pca, type='l')
scores <- data.frame(rownames(tet), tet.pca$x[,1:3], length=width(Tn004.fasta), cluster=cl.split)
qplot(x=PC1, y=PC2, data=scores, col=as.factor(cluster))
qplot(x=PC1, y=PC3, data=scores, col=as.factor(cluster))
qplot(x=PC2, y=PC3, data=scores, col=as.factor(cluster))
Map dinucleotide frequency data onto contig coverage and SNP rate
fullSummary <- read.delim('Varscan.CovSNPrate.summary.tab', header=T)
clusterData <- data.frame(Chrom = gsub('\\|size[0-9]+', '', names(Tn004.fasta)),
cluster = cl.split)
allData <- merge(subset(fullSummary, Sample=='Tn004'), clusterData, by = 'Chrom', all.x=T)
allData[is.na(allData$cluster),'cluster'] <- 0
#Coverage distributions
ggplot(allData, aes(x=medCov)) + geom_histogram(binwidth=5, alpha=0.2, col='black') + scale_x_continuous(limits = c(0, 150)) + geom_freqpoly(aes(x=medCov, col=as.factor(cluster)), binwidth=5, size=2)
## Warning: Removed 148 rows containing non-finite values (stat_bin).
## Warning: Removed 148 rows containing non-finite values (stat_bin).
## Warning: Removed 10 rows containing missing values (geom_path).
#SNP rate distrubutions
ggplot(allData, aes(x=SNPrate*100)) + geom_histogram(binwidth=0.5, alpha=0.2, col='black') + geom_freqpoly(aes(x=SNPrate*100, col=as.factor(cluster)), binwidth=0.5, size=2) + scale_x_continuous(limits = c(0,10))
## Warning: Removed 38 rows containing non-finite values (stat_bin).
## Warning: Removed 38 rows containing non-finite values (stat_bin).
## Warning: Removed 10 rows containing missing values (geom_path).
# Load SNP-only dataset
snpSummary <- read.delim('Tn004_S1_L001_v_cdhit.VarScanSNPs.tab', header=T)
snpSummary <- cbind(colsplit(snpSummary$Chrom, '\\|size', c("Chrom", "ContigLength")), snpSummary[2:19])
snpSummary$VarFreq <- as.numeric(gsub('%','',snpSummary$VarFreq))
snpSummary['MinorAlleleFreq'] <- ifelse(snpSummary$VarFreq <= 50, snpSummary$VarFreq, 100-snpSummary$VarFreq)
# Merge with cluster data (keep only SNPs that are observed on both strands)
allSNPs <- merge(subset(snpSummary, Strands1 == 2 & Strands2 == 2), clusterData, by = 'Chrom', all.x=T)
ggplot(allSNPs, aes(x=MinorAlleleFreq)) + geom_histogram(binwidth=1, alpha=0.2, col='black') + geom_freqpoly(aes(x=MinorAlleleFreq, col=as.factor(cluster)), binwidth=1, size=2)
blastTab <- read.delim('../../blast_test.tab', header=F)
colnames(blastTab) <- c('Query', 'Hit.id', 'percentID', 'Align.length', 'mismatches', 'gaps', 'qstart', 'qend', 'sstart', 'send')
blast.ss <- subset(blastTab, Align.length > 100)
blast.hit.count <- ddply(blast.ss, 'Query', 'summarise',
hit.count = length(percentID),
av.id = mean(percentID))
scores.w.blast <- merge(scores, blast.hit.count, by.x = 'rownames.tet.', by.y='Query', all.x=TRUE)
scores.w.blast$hit.count[is.na(scores.w.blast$hit.count)] <- 0
qplot(x=PC1, y=PC2, data=scores.w.blast, col=hit.count > 0, alpha=av.id)
nohits <- names(Tn004.fasta)[!names(Tn004.fasta) %in% blast.hit.count$Query]
writeXStringSet(Tn004.fasta[nohits], file='../../test_noHit.fasta', format = 'fasta', width=60)
tet <- oligonucleotideFrequency(Tn004.fasta, 4, as.prob = T)
rownames(tet) <- names(Tn004.fasta)
cl <- hclust(dist(tet), method='ward.D2')
plot(as.dendrogram(cl), leaflab='none')
cl.split <- cutree(cl, 4)
table(cl.split)
## cl.split
## 1 2 3 4
## 1168 621 488 136
tet.pca <- prcomp(tet, center = T, scale. = T)
plot(tet.pca, type='l')
scores <- data.frame(rownames(tet), tet.pca$x[,1:3], length=width(Tn004.fasta), cluster=cl.split)
qplot(x=PC1, y=PC2, data=scores, col=as.factor(cluster))
qplot(x=PC1, y=PC3, data=scores, col=as.factor(cluster))
qplot(x=PC2, y=PC3, data=scores, col=as.factor(cluster))
clusterData <- data.frame(Chrom = gsub('\\|size[0-9]+', '', names(Tn004.fasta)),
cluster = cl.split)
allData <- merge(subset(fullSummary, Sample=='Tn004'), clusterData, by = 'Chrom', all.x=T)
allData[is.na(allData$cluster),'cluster'] <- 0
#Coverage distributions
ggplot(allData, aes(x=medCov)) + geom_histogram(binwidth=5, alpha=0.2, col='black') + scale_x_continuous(limits = c(0, 150)) + geom_freqpoly(aes(x=medCov, col=as.factor(cluster)), binwidth=5, size=2)
## Warning: Removed 148 rows containing non-finite values (stat_bin).
## Warning: Removed 148 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_path).
#SNP rate distrubutions
ggplot(allData, aes(x=SNPrate*100)) + geom_histogram(binwidth=0.5, alpha=0.2, col='black') + geom_freqpoly(aes(x=SNPrate*100, col=as.factor(cluster)), binwidth=0.5, size=2) + scale_x_continuous(limits = c(0,10))
## Warning: Removed 38 rows containing non-finite values (stat_bin).
## Warning: Removed 38 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_path).
#blast results
blast.hit.count$Query <- gsub('\\|size[0-9]*', '', blast.hit.count$Query)
allData.w.blast <- merge(allData, blast.hit.count, by.x='Chrom', by.y='Query', all.x=TRUE)
allData.w.blast$hit.count[is.na(allData.w.blast$hit.count)] <- 0
ggplot(allData.w.blast, aes(x=medCov)) + geom_histogram(binwidth=5, alpha=0.2, col='black') + scale_x_continuous(limits = c(0, 150)) + geom_freqpoly(aes(x=medCov, col=hit.count>0), binwidth=5, size=2)
## Warning: Removed 148 rows containing non-finite values (stat_bin).
## Warning: Removed 148 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
ggplot(allData.w.blast, aes(x=SNPrate*100)) + geom_histogram(binwidth=0.5, alpha=0.2, col='black') + scale_x_continuous(limits = c(0, 10)) + geom_freqpoly(aes(x=SNPrate*100, col=hit.count>0), binwidth=0.5, size=2)
## Warning: Removed 38 rows containing non-finite values (stat_bin).
## Warning: Removed 38 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
ggplot(allData.w.blast, aes(x=ContigLength)) + geom_histogram(alpha=0.2, col='black') + geom_freqpoly(aes(x=ContigLength, col=hit.count>0), size=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Remove cluster 4
allData.ss <- subset(allData.w.blast, cluster!=4)
ggplot(allData.ss, aes(x=medCov)) + geom_histogram(binwidth=5, alpha=0.2, col='black') + scale_x_continuous(limits = c(0, 150)) + geom_freqpoly(aes(x=medCov, col=hit.count>0), binwidth=5, size=2)
## Warning: Removed 146 rows containing non-finite values (stat_bin).
## Warning: Removed 146 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
ggplot(allData.ss, aes(x=SNPrate*100)) + geom_histogram(binwidth=0.5, alpha=0.2, col='black') + scale_x_continuous(limits = c(0, 10)) + geom_freqpoly(aes(x=SNPrate*100, col=hit.count>0), binwidth=0.5, size=2)
## Warning: Removed 38 rows containing non-finite values (stat_bin).
## Warning: Removed 38 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
ggplot(allData.ss, aes(x=ContigLength)) + geom_histogram(alpha=0.2, col='black') + geom_freqpoly(aes(x=ContigLength, col=hit.count>0), size=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Merge with cluster data (keep only SNPs that are observed on both strands)
allSNPs <- merge(subset(snpSummary, Strands1 == 2 & Strands2 == 2), clusterData, by = 'Chrom', all.x=T)
ggplot(allSNPs, aes(x=MinorAlleleFreq)) + geom_histogram(binwidth=1, alpha=0.2, col='black') + geom_freqpoly(aes(x=MinorAlleleFreq, col=as.factor(cluster)), binwidth=1, size=2)
allSNPs.w.blast <- merge(allSNPs, blast.hit.count, by.x='Chrom', by.y='Query', all.x=TRUE)
allSNPs.w.blast$hit.count[is.na(allSNPs.w.blast$hit.count)] <- 0
ggplot(allSNPs.w.blast, aes(x=MinorAlleleFreq)) + geom_histogram(binwidth=1, alpha=0.2, col='black') + geom_freqpoly(aes(x=MinorAlleleFreq, col=hit.count>0), binwidth=1, size=2)
#Remove cluster 4
allSNPs.ss <- subset(allSNPs.w.blast, cluster != 4)
ggplot(allSNPs.ss, aes(x=MinorAlleleFreq)) + geom_histogram(binwidth=1, alpha=0.2, col='black') + geom_freqpoly(aes(x=MinorAlleleFreq, col=hit.count>0), binwidth=1, size=2)
#Tn004.cluster5 = Tn004.fasta[cl.split==5]
#writeXStringSet(Tn004.cluster5, file='Tn004_odd_cluster.fasta', format = 'fasta', width=60)